library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.2
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(rstanarm)
## Warning: package 'rstanarm' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.2
## This is rstanarm version 2.21.3
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(stringr)
library(ggsoccer)
## Warning: package 'ggsoccer' was built under R version 4.2.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.2
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(stringi)
## Warning: package 'stringi' was built under R version 4.2.2
We first want to load in the events data, which split across multiple csv files, grouped by country. To read this in, I have defined a loop that iterates through the data directory and appends each file to a data-frame.
#Define Data Directory
dir_path <- "../Data/events"
#Get a list of files in the directory
file_list <- list.files(dir_path)
#Create an empty data frame to store the file contents
actions <- data.frame()
#Loop through the files and add their contents to the data frame
for (i in 1:length(file_list)) {
#Read the file into a data frame
file_data <- fread(file.path(dir_path, file_list[i]))
#Add the file data to the main data frame
actions <- rbind(actions, file_data)
}
#View the events data frame
actions
We next want to load in the player data
#Read in data
players <- fromJSON("../Data/players/players.json")
#View player data
players
Since the event data contains all sorts of event types, we now want to filter out only the shots.
# Extract observations of shots from the actions data
shots_df <- actions %>% dplyr::filter(subEventName == "Shot")
The next section is dedicated to loading the event data
We now use documentation from here to mutate the data, and garner more useful categorical data about the shot itself.
shots_df <- shots_df %>%
#If the shot is successful
mutate('is_goal' = ifelse(grepl(" 101}", shots_df$tags),1,0),
#If the shot is at the end of a counter-attack
'is_CA' = ifelse(grepl(" 1901}", shots_df$tags),1,0),
#If the shot is with the foot or another part of the body
'body_part' = ifelse(grepl(" 401}", shots_df$tags),"left",
ifelse(grepl(" 402}", shots_df$tags), "right",
ifelse(grepl(" 403}", shots_df$tags), "head/body", "NA"))),
#If the shot is blocked
'is_blocked' = ifelse(grepl(" 2101}", shots_df$tags), 1,0))
#Filter out only unblocked shots
shots_df <- shots_df %>% dplyr::filter(is_blocked == 0)
#Keep necessary categorical data
shots_cat <- dplyr::select(shots_df, c('playerId', 'is_goal', 'is_CA', 'body_part'))
summary(shots_cat)
## playerId is_goal is_CA body_part
## Min. : 0 Min. :0.0000 Min. :0.00000 Length:32851
## 1st Qu.: 11066 1st Qu.:0.0000 1st Qu.:0.00000 Class :character
## Median : 25707 Median :0.0000 Median :0.00000 Mode :character
## Mean : 93637 Mean :0.1366 Mean :0.05793
## 3rd Qu.:142755 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :552555 Max. :1.0000 Max. :1.00000
We now calculate the distance from the goal and angle to goal for each shot.
First, we would like to define the position of each shot on the pitch
#Extract all numeric entries from the positions column
pos <- str_extract_all(gsub('"', "", shots_df$positions), "\\d+")
#Define vectors to store coordinates
x_pos <- c()
y_pos <- c()
#Loops that extract the coordinates
for (i in 1:length(pos)){
x_pos <- append(x_pos, pos[[i]][2])
}
for (i in 1:length(pos)){
y_pos <- append(y_pos, pos[[i]][1])
}
#Convert coordinates to numeric data
x_pos <- x_pos %>% as.numeric()
y_pos <- y_pos %>% as.numeric()
# Create coordinate dataframe
coords <- data.frame(x_pos, y_pos)
We can now use these coordinates to calculate distance and angle to goal
#Define length and width of pitch
pitch_x <- 105
pitch_y <- 68
#We now convert coordinates to meters
x_meter <- coords$x_pos * pitch_x/100
y_meter <- coords$y_pos * pitch_y/100
# Calculate distances
dist <- sqrt((105 - x_meter)^2 + ((32.5) - y_meter)^2)
#Calculate angles
angles <- atan( (7.32 * (105 - x_meter) ) / ( (105 - x_meter)^2 + (32.5 - y_meter)^2 - (7.32/2)^2 )) * 180/pi
We can now merge our useful event data into one data-frame.
#Concatenate data-frames
shots <- data.frame(shots_cat, dist, angles)
The next section is dedicated to loading the player data. For this, we simply filter only by the features that will prove useful later on.
First lets retrieve a vector of all unique players in the current
shots data-base:
#Retrieves unique player ids
player_list <- unique(shots$playerId)
We can now filter the players data-frame to only include
these players
#Filters by player ids between both data frames
shooters <- dplyr::filter(players, wyId %in% player_list)
We can filter this data-frame by the features we need.
#Selects necessary columns
shooters <- dplyr::select(shooters, c('shortName', 'wyId', 'foot'))
Finally, we rename the some columns, for ease later on.
#Renames columns
colnames(shooters)[colnames(shooters) == "wyId"] <- "playerId"
colnames(shooters)[colnames(shooters) == "foot"] <- "preferred_foot"
We will now introduce a preferred foot binary variable.
First we merge all our useful data into one data-frame
#Concatenate data-frames
shots <- merge(shots, shooters, by = "playerId")
We now mutate this to add a column featuring the desired binary variable
#Adds preferred foot binary column
shots <- shots %>%
mutate(preferred_foot_b = ifelse(shots$preferred_foot == shots$body_part, 1, 0))
Finally, we remove the preferred_foot column
#Removes desired column
shots <- shots %>% dplyr::select(-c("preferred_foot"))
Since much of our data is categorical, it is necessary to convert it to the factor type.
#Convert necessary variables to factor
shots$is_goal <- shots$is_goal %>% as.factor()
shots$body_part <- shots$body_part %>% as.factor()
shots$is_CA <- shots$is_CA %>% as.factor()
shots$preferred_foot_b <- shots$preferred_foot_b %>% as.factor()
shots$shortName <- shots$shortName %>% as.factor()
Now lets view a summary of our data
summary(shots)
## playerId is_goal is_CA body_part dist
## Min. : 36 0:28360 0:30946 head/body: 6645 Min. : 0.54
## 1st Qu.: 11066 1: 4489 1: 1903 left :10225 1st Qu.: 11.56
## Median : 25707 right :15979 Median : 15.99
## Mean : 93642 Mean : 17.88
## 3rd Qu.:142755 3rd Qu.: 24.22
## Max. :552555 Max. :103.97
##
## angles shortName preferred_foot_b
## Min. :-87.00 Cristiano Ronaldo : 155 0:12623
## 1st Qu.: 14.40 L. Insigne : 139 1:20226
## Median : 19.64 H. Kane : 137
## Mean : 23.54 E. D\\u017eeko : 121
## 3rd Qu.: 30.59 L. Messi : 121
## Max. : 89.66 I. Peri\\u0161i\\u0107: 117
## (Other) :32059
If we view a random subset of our data, we observe a problem decoding unicode characters:
shots[90:100,]
So we use the following chunk to decode them
shots$shortName <- stringi::stri_unescape_unicode(shots$shortName)
We can see from the summary there are negative angles in the data, to investigate this further we can look at a histogram
hist(shots$angles)
We observe that there are multiple negative angles. Since most of the angles are correctly positive, we will remove the negative ones from the analysis.
shots <- shots %>% dplyr::filter(shots$angles > 0)
To see the corrected histogram
hist(shots$angles)
Later on we will use playerId to group the data. Since
our data-set is large and spans many countries, there are many different
players in the data-set
length(table(shots$playerId))
## [1] 2292
We see there are 2292 unique player included in the data-set
With this in hand, it would be sensible to limit the amount of “groups” (players) to, say 50. In order to preserve the greatest amount of data, we will use the top 50 most occurring player names.
top_players <- sort(table(shots$playerId), decreasing = T)[1:50]
Now we filter the data based on these players
top_shots <- dplyr::filter(shots, playerId %in% row.names(top_players))
We now view a summary of our final data
summary(top_shots)
## playerId is_goal is_CA body_part dist
## Min. : 122 0:3299 0:3716 head/body: 756 Min. : 3.679
## 1st Qu.: 7905 1: 727 1: 310 left :1329 1st Qu.:10.974
## Median : 14945 right :1941 Median :14.749
## Mean : 53939 Mean :16.472
## 3rd Qu.: 25716 3rd Qu.:20.832
## Max. :447804 Max. :98.898
## angles shortName preferred_foot_b
## Min. : 2.041 Length:4026 0:1539
## 1st Qu.:15.430 Class :character 1:2487
## Median :22.007 Mode :character
## Mean :25.839
## 3rd Qu.:32.098
## Max. :89.660
To better understand our distance and angle data, we can create the following boxplots
#Defines and distance boxplot
dist_boxplot <- ggplot(top_shots, aes(x=is_goal, y=dist, fill = is_goal)) +
geom_boxplot() +
labs(title="Distributions Of Distances Grouped By Shot Outcome",
x="Shot Outcome",
y="Distance To Goal (m)") +
coord_flip()
dist_boxplot <- dist_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))
#Defines angles boxplot
angles_boxplot <- ggplot(top_shots, aes(x=is_goal, y=angles, fill = is_goal)) +
geom_boxplot() +
labs(title="Distributions Of Angles Grouped By Shot Outcome",
x="Shot Outcome",
y="Angle To Goal (Degrees)") +
coord_flip()
angles_boxplot <- angles_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))
#Plots distance boxplot
dist_boxplot
#Plots angles boxplot
angles_boxplot
From these we observe there must is likely some relationship between the
outcome of a shot and the position from which it is taken.
We now visualise the frequency at which each player takes a shot
Firstly, we need to wrangle the data a bit more:
#Create data-frame from top_players table defined earlier
top_players_df <- data.frame(top_players)
#Rename columns
colnames(top_players_df)[colnames(top_players_df) == "Var1"] <- "playerId"
colnames(top_players_df)[colnames(top_players_df) == "Freq"] <- "shotVolume"
#We add a column containing player name
top_players_df <- merge(top_players_df, distinct(top_shots[, c("playerId", "shortName")]), by = "playerId")
#We create a dataframe where the is_goal variable is numeric
numeric_goals <- top_shots[, c("shortName", "is_goal")]
numeric_goals$is_goal <- as.numeric(as.character(numeric_goals$is_goal))
#We sum up goals scored by player
summed_goals <- numeric_goals %>%
group_by(shortName) %>%
summarise(goals = sum(is_goal))
#Merge to final data-frame
shots_goals <- merge(top_players_df, summed_goals, by = "shortName")
#Sort in descending order by shot volume
shots_goals <- arrange(shots_goals, desc(shotVolume))
shots_goals$shortName <- shots_goals$shortName %>% as.factor()
Now we visualise the results
shots_goals_long <- gather(shots_goals, key = var, value = value, shotVolume, goals)
shots_goals_plot <- ggplot(shots_goals_long, aes(x=reorder(shortName, -value), y = value, fill = var)) +
geom_col(position = "identity", width = 0.9) +
labs(title="Shots And Goals By Player",
x="Players",
y="Volume") +
scale_x_discrete(guide = guide_axis(angle = 60))
shots_goals_plot <- shots_goals_plot + guides(fill=guide_legend(title="Key"))
shots_goals_plot
We can see from the plot, that there is some difference in a players
ability to convert a shot. We can exploit this difference by adding
another level to our models.
Since our data-set is somewhat small, it would be wise to have an uneven split of test and train data. This is carried out in the following chunk.
# Split into test and train subsets
train.size <- 0.8 * nrow(top_shots)
train <- sample(1:nrow(top_shots), train.size)
test <- -train
shots.train <- top_shots[train, ]
shots.test <- top_shots[test, ]
is_goal.test <- top_shots$is_goal[test]
is_goal ~ distFirst we will fit some simple logistic regression models
glm1 <- glm(is_goal ~ dist, data = top_shots, family = binomial())
summary(glm1)
##
## Call:
## glm(formula = is_goal ~ dist, family = binomial(), data = top_shots)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0883 -0.6975 -0.5261 -0.2792 3.5203
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.226364 0.110297 2.052 0.0401 *
## dist -0.119491 0.007803 -15.313 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3802.7 on 4025 degrees of freedom
## Residual deviance: 3495.2 on 4024 degrees of freedom
## AIC: 3499.2
##
## Number of Fisher Scoring iterations: 5
is_goal ~ dist + anglesglm2 <- glm(is_goal ~ dist + angles, data = top_shots, family = binomial())
summary(glm2)
##
## Call:
## glm(formula = is_goal ~ dist + angles, family = binomial(), data = top_shots)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4259 -0.6616 -0.5080 -0.3281 3.1229
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.008302 0.283171 -3.561 0.00037 ***
## dist -0.074710 0.011819 -6.321 2.60e-10 ***
## angles 0.020640 0.004428 4.661 3.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3802.7 on 4025 degrees of freedom
## Residual deviance: 3473.2 on 4023 degrees of freedom
## AIC: 3479.2
##
## Number of Fisher Scoring iterations: 5
is_goal ~ .glm3 <- glm(is_goal ~ . - shortName - playerId, data = top_shots, family = binomial())
summary(glm3)
##
## Call:
## glm(formula = is_goal ~ . - shortName - playerId, family = binomial(),
## data = top_shots)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8279 -0.6449 -0.4795 -0.3011 3.3185
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.827369 0.313812 -5.823 5.78e-09 ***
## is_CA1 0.488807 0.150439 3.249 0.00116 **
## body_partleft 1.030124 0.146097 7.051 1.78e-12 ***
## body_partright 0.787840 0.171598 4.591 4.41e-06 ***
## dist -0.091253 0.012458 -7.325 2.39e-13 ***
## angles 0.026604 0.004658 5.712 1.12e-08 ***
## preferred_foot_b1 0.209729 0.126546 1.657 0.09745 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3802.7 on 4025 degrees of freedom
## Residual deviance: 3370.6 on 4019 degrees of freedom
## AIC: 3384.6
##
## Number of Fisher Scoring iterations: 5